Disclaimer: This Markdown file is for Task B.
rm(list = ls())
library(plyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.4 v stringr 1.4.0
## v tidyr 1.2.0 v forcats 0.5.1
## v readr 2.1.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::arrange() masks plyr::arrange()
## x purrr::compact() masks plyr::compact()
## x dplyr::count() masks plyr::count()
## x dplyr::failwith() masks plyr::failwith()
## x dplyr::filter() masks stats::filter()
## x dplyr::id() masks plyr::id()
## x dplyr::lag() masks stats::lag()
## x dplyr::mutate() masks plyr::mutate()
## x dplyr::rename() masks plyr::rename()
## x dplyr::summarise() masks plyr::summarise()
## x dplyr::summarize() masks plyr::summarize()
library(ggplot2)
library(ggpubr)
##
## Attaching package: 'ggpubr'
## The following object is masked from 'package:plyr':
##
## mutate
library(rgdal)
## Warning: package 'rgdal' was built under R version 4.1.3
## Loading required package: sp
## Please note that rgdal will be retired by the end of 2023,
## plan transition to sf/stars/terra functions using GDAL and PROJ
## at your earliest convenience.
##
## rgdal: version: 1.5-29, (SVN revision 1165M)
## Geospatial Data Abstraction Library extensions to R successfully loaded
## Loaded GDAL runtime: GDAL 3.2.1, released 2020/12/29
## Path to GDAL shared files: C:/Users/Pranav/Documents/R/win-library/4.1/rgdal/gdal
## GDAL binary built with GEOS: TRUE
## Loaded PROJ runtime: Rel. 7.2.1, January 1st, 2021, [PJ_VERSION: 721]
## Path to PROJ shared files: C:/Users/Pranav/Documents/R/win-library/4.1/rgdal/proj
## PROJ CDN enabled: FALSE
## Linking to sp version:1.4-6
## To mute warnings of possible GDAL/OSR exportToProj4() degradation,
## use options("rgdal_show_exportToProj4_warnings"="none") before loading sp or rgdal.
## Overwritten PROJ_LIB was C:/Users/Pranav/Documents/R/win-library/4.1/rgdal/proj
library(geojsonio)
## Warning: package 'geojsonio' was built under R version 4.1.3
## Registered S3 method overwritten by 'geojsonsf':
## method from
## print.geojson geojson
##
## Attaching package: 'geojsonio'
## The following object is masked from 'package:base':
##
## pretty
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(cowplot)
##
## Attaching package: 'cowplot'
## The following object is masked from 'package:lubridate':
##
## stamp
## The following object is masked from 'package:ggpubr':
##
## get_legend
library(leaflet)
library(reshape)
## Warning: package 'reshape' was built under R version 4.1.3
##
## Attaching package: 'reshape'
## The following object is masked from 'package:cowplot':
##
## stamp
## The following object is masked from 'package:lubridate':
##
## stamp
## The following objects are masked from 'package:tidyr':
##
## expand, smiths
## The following object is masked from 'package:dplyr':
##
## rename
## The following objects are masked from 'package:plyr':
##
## rename, round_any
library(raster)
## Warning: package 'raster' was built under R version 4.1.3
##
## Attaching package: 'raster'
## The following object is masked from 'package:dplyr':
##
## select
library(RColorBrewer)
library(spatialEco)
##
## Attaching package: 'spatialEco'
## The following object is masked from 'package:raster':
##
## shift
## The following object is masked from 'package:dplyr':
##
## combine
library(htmltools)
B1. Create a GeoJSON file where each postcode is represented with a latitude, longitude value, together with minimum, maximum, mean and median house price.
Ans: We create a GeoJSON file with the required information
We load the required data. We remove the ID column from the postcodes data set because it is redundant. We then add a column holding the year of each transaction to the original data set, which makes the data easier to view and group later on.
# Work from the project folder so the relative file names used below resolve.
setwd("D:\\BSE\\BSE Material\\sem 2\\Data Vis\\Project")

# Property transactions (one row per sale).
pp_data <- read.csv("ppdata_lite.csv")

# Postcode lookup with latitude/longitude. The first column is a running id
# that carries no information, so it is dropped straight after reading.
ukpostcodes <- read.csv("ukpostcodes.csv", header = TRUE, sep = ",")[-1]

# Copy of the transactions with the calendar year of each transfer added.
# POSIXlt stores years as an offset from 1900, hence the + 1900.
ppdata <- pp_data
ppdata$year <- as.POSIXlt(ppdata$date_of_transfer)$year + 1900
We create a separate data frame containing the postcodes, prices and years, and compute the mean, maximum, minimum and median price for each available postcode.
## # A tibble: 6 x 5
## postcode mean_price median_price max_price min_price
## <chr> <dbl> <dbl> <int> <int>
## 1 "" 280211. 135000 44033000 750
## 2 "AL1 1AJ" 203018. 159995 435000 101950
## 3 "AL1 1AR" 300000 300000 350000 250000
## 4 "AL1 1AS" 356667. 285000 500000 285000
## 5 "AL1 1BH" 134500 170000 187000 55500
## 6 "AL1 1BX" 731667. 725000 1200000 270000
We then remove the first row, which (as the table above shows) aggregates all transactions with an empty postcode; we do not need it, and it throws an error when we merge the data with ukpostcodes. We then create a spatial dataset and write the required GeoJSON file.
## postcode mean_price median_price max_price min_price
## 1 AL1 1AJ 203018.4 159995 435000 101950
## 2 AL1 1AR 300000.0 300000 350000 250000
## 3 AL1 1AS 356666.7 285000 500000 285000
## 4 AL1 1BH 134500.0 170000 187000 55500
## 5 AL1 1BX 731666.7 725000 1200000 270000
## 6 AL1 1BZ 265000.0 265000 390000 140000
B2. Open the GeoJSON file in the GIS application of your choice and colour-code the data to give an overview of areas with high, medium and low median house price. Additionally, you can visualise this information as choropleths or use shiny and add the information as markers on a map for a more interactive and impressive
Ans: The question requires us to use a GIS application and view areas with high, medium and low median house prices. After further discussion with colleagues, I use the Area shapefile data, which is keyed by the first letter (in some cases the first two letters) of the postcodes. This reduced the running time, a major constraint on the device used, and helped display the spatial data better for the required plot.
This time we build the dataset from the original data and reduce the postcode column to the first letter or the first two letters of each postcode. As instructed, we then compute the mean, median, maximum and minimum house price for each of these postcode areas.
# Price statistics per postcode area.
#
# The area is the leading run of letters of the postcode ("AL1 1AJ" -> "AL",
# "B1 1AA" -> "B"). Extracting it with an anchored pattern keeps single-letter
# areas free of trailing blanks, so they match the `name` column of the area
# shapefile, and prevents letters from the district part (e.g. the "A" in
# "W1A") from leaking into the area code.
Dataset <- pp_data
Dataset$postcode <- sub(
  "^([[:alpha:]]*).*$", "\\1",
  trimws(Dataset$postcode)
)

Dataset <- Dataset %>%
  # Drop only what would corrupt the summary: missing prices and rows without
  # a usable postcode. Filtering by value replaces removing "the first row",
  # which silently assumed the empty key always sorts first.
  dplyr::filter(!is.na(price), !is.na(postcode), postcode != "") %>%
  dplyr::group_by(postcode) %>%
  dplyr::summarise(
    mean_price = mean(price),
    median_price = median(price),
    max_price = max(price),
    min_price = min(price)
  )

head(Dataset)
## # A tibble: 6 x 5
## postcode mean_price median_price max_price min_price
## <chr> <dbl> <dbl> <int> <int>
## 1 "AL" 274660. 214998. 10004563 375
## 2 "B " 134297. 110000 20000000 700
## 3 "BA" 172719. 142000 4025000 3000
## 4 "BB" 82682. 61500 4050000 150
## 5 "BD" 100427. 82000 5875000 2000
## 6 "BH" 193451. 163000 19972500 1000
We load the area-level spatial data and merge it with the dataset created above to obtain a spatial dataset containing the longitude, latitude, price statistics and postcode areas of the data to be plotted.
# Read the postcode-area polygons; the path is relative to the working
# directory.
Area <- raster::shapefile(x = "shapes/Areas.shp")

# Show which spatial class the reader returned.
class(Area)
## [1] "SpatialPolygonsDataFrame"
## attr(,"package")
## [1] "sp"
# Attach the price statistics to the polygons: the shapefile's `name` column
# is matched against the `postcode` key of the summary table.
Map_data <- merge(
  x = Area,
  y = Dataset,
  by.x = "name",
  by.y = "postcode"
)

# Preview the attribute table of the merged object.
head(Map_data)
## name mean_price median_price max_price min_price
## 1 AB NA NA NA NA
## 2 AL 274660.23 214997.5 10004563 375
## 3 B NA NA NA NA
## 4 BA 172719.47 142000.0 4025000 3000
## 5 BB 82681.88 61500.0 4050000 150
## 6 BD 100427.21 82000.0 5875000 2000
# Keep only polygons with complete price statistics; areas that found no
# match in the merge carry NA values and are removed here.
Map_data <- spatialEco::sp.na.omit(Map_data)
## Deleting rows: 132331333638465152545556646768777986110120121122123124
Before plotting, we need to define how we will divide our data to show whether house prices are low, medium or high. Instead of sticking to three categories, I have chosen to divide the values into six parts based on their percentiles. We then assign colours to these intervals.
# Colour scale for the mean-price map.
#
# The upper bin edges are the six sextiles of the mean price (1/6, 2/6, ...,
# 1); exact fractions keep the six groups equally sized.
intervals <- quantile(
  Map_data$mean_price,
  probs = seq_len(6) / 6,
  names = FALSE,
  na.rm = TRUE
)

# Prepend 0 as the lower edge of the first bin. unique() guards against tied
# quantiles, because colorBin() needs distinct break points.
values <- unique(c(0, intervals))

factpal <- colorBin("PRGn", bins = values, domain = Map_data$mean_price)
We then try to display the UK property prices using these intervals based on the postcodes.
# Choropleth of the mean house price per postcode area. Columns of Map_data
# are referenced through leaflet's formula interface, and the polygons are
# filled with the binned palette `factpal`.
mapplot_mean <- leaflet(Map_data) %>%
  setView(lng = -2, lat = 52.2783, zoom = 8) %>%
  addProviderTiles("Stamen.TonerHybrid") %>%
  addPolygons(
    fillColor = ~factpal(mean_price),
    weight = 0.2,
    fillOpacity = 0.5,
    smoothFactor = 0.2
  ) %>%
  addLegend(
    pal = factpal,
    values = ~mean_price,
    title = "Mean HP data"
  )

# Render the widget.
mapplot_mean
We now use the median instead of the mean, as the frequency of different properties might not display the true average value of property prices. The median is better for skewed distributions, so choosing the median gives us a better, more robust and more sensible plot. The median frees us from the disadvantage of the mean, which considers not just the values but also their occurrences.
# Colour scale and choropleth for the median house price per postcode area.
#
# The upper bin edges are the six sextiles of the median price (1/6, 2/6,
# ..., 1); exact fractions keep the six groups equally sized.
intervals <- quantile(
  Map_data$median_price,
  probs = seq_len(6) / 6,
  names = FALSE,
  na.rm = TRUE
)

# Prepend 0 as the lower edge of the first bin. unique() guards against tied
# quantiles, because colorBin() needs distinct break points.
values <- unique(c(0, intervals))

factpal <- colorBin("PRGn", bins = values, domain = Map_data$median_price)

# Columns of Map_data are referenced through leaflet's formula interface.
mapplot_median <- leaflet(Map_data) %>%
  setView(lng = -2, lat = 52.2783, zoom = 8) %>%
  addProviderTiles("Stamen.TonerHybrid") %>%
  addPolygons(
    fillColor = ~factpal(median_price),
    weight = 0.5,
    fillOpacity = 0.5,
    smoothFactor = 0.25
  ) %>%
  addLegend(
    pal = factpal,
    values = ~median_price,
    title = "Median HP data"
  )

# Render the widget.
mapplot_median